Data set 1: 10X purified cells
X = readMM("../data/10X_pbmc_filtered/matrix.mtx")
X = as.matrix(X)
X = t(X)
label = read.table("../data/10X_pbmc_filtered/truelabel.tsv")$V1
rm(orig)
df = preprocess(X, label=label, normalize = FALSE)
df2 = preprocess(X, label = label, normalize = TRUE)
rm(X); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1749429 93.5 3683070 196.7 2783514 148.7
## Vcells 4756188 36.3 1096746803 8367.6 1784982126 13618.4
table(label)
## label
## B cd34 cytotoxicT helperT
## 1355 3094 1906 1035
## memoryT monocytes naivecytotoxicT naiveT
## 1716 88 1292 456
## NK regulatoryT
## 1443 689
tmpdf = df %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(gene_mean<10)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g1 = ggplot(tmpdf,
aes(x=gene_mean, y=dropout_rate, col = celltype)) +
geom_point(alpha = 0.4) +
ylab("drop out rate") +
xlab("mean(X) for X>0") +
ggtitle("dropout rates vs gene mean")
tmpdf = df2 %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(gene_mean<10)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g2 = ggplot(tmpdf,
aes(x=gene_mean, y=dropout_rate, col = celltype)) +
geom_point(alpha = 0.4) +
ylab("drop out rate") +
xlab("mean(X) for X>0")+
ggtitle("normalized")
ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

tmpdf = df %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(celltype %in% c("B", "helperT")) %>% filter(gene_mean<4)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g1 = ggplot(tmpdf,
aes(x=gene_mean, y=dropout_rate, col = celltype)) +
geom_point(alpha = 0.4) +
ylab("drop out rate") +
xlab("mean(X) for X>0")+
ggtitle("two celltypes selected")
tmpdf = df2 %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(celltype %in% c("B", "helperT")) %>% filter(gene_mean<4)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g2 = ggplot(tmpdf,
aes(x=gene_mean, y=dropout_rate, col = celltype)) +
geom_point(alpha = 0.4) +
ylab("drop out rate") +
xlab("mean(X) for X>0")+
ggtitle("normalized")
ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

df$range = cut(df$dropout_rate, seq(0.01,0.99,length=99), include.lowest=TRUE)
df = df%>% group_by(range, celltype) %>% summarize(var = var(gene_mean), mean = mean(gene_mean))
df$range = as.character(df$range)
df$minrange = substring(sapply(strsplit(df$range, ','), function(x) x[1]), 2)
df$maxrange = gsub('.{1}$', '', sapply(strsplit(df$range, ','), function(x) x[2]) )
df$minrange = as.numeric(df$minrange)
df$maxrange = as.numeric(df$maxrange)
df$mean_dropout_rate = (df$minrange + df$maxrange) / 2
fitdf = df %>% group_by(celltype) %>% do(decayfit = nls(mean_dropout_rate ~ SSasymp(mean, yf, y0, log_alpha), data=.))
decayfit_tidy = broom::tidy(fitdf, decayfit)
fitted = list()
newdf = data.frame(range = NA, celltype = NA, var = NA, mean = NA, minrange = NA, maxrange = NA, mean_dropout_rate = NA, fitted = NA)
for (t in 1:length(table(label))){
tmp = df %>% filter(celltype==names(table(label))[t])
tmp$fitted = predict(fitdf$decayfit[t][[1]], newdata = tmp)
newdf = rbind(newdf, as.data.frame(tmp))
}
newdf = newdf[-1, ]
ind = sort(newdf$mean_dropout_rate, index.return=TRUE)$ix
newdf = newdf[ind, ]
newdf = newdf %>% filter(mean < 7)
g1 = ggplot(newdf%>% filter(mean<7), aes(x=mean, y=mean_dropout_rate, col=celltype)) +
geom_point(alpha = 0.5) + geom_line(aes(x=mean, y=fitted, col=celltype), lwd=2, alpha = 0.3) +
ylab("dropout rate") +
xlab("gene mean") +
ylim(c(0,1)) +
ggtitle("fitted exponential decay")
df = df2
df$range = cut(df$dropout_rate, seq(0.01,0.99,length=99), include.lowest=TRUE)
df = df%>% group_by(range, celltype) %>% summarize(var = var(gene_mean), mean = mean(gene_mean))
df$range = as.character(df$range)
df$minrange = substring(sapply(strsplit(df$range, ','), function(x) x[1]), 2)
df$maxrange = gsub('.{1}$', '', sapply(strsplit(df$range, ','), function(x) x[2]) )
df$minrange = as.numeric(df$minrange)
df$maxrange = as.numeric(df$maxrange)
df$mean_dropout_rate = (df$minrange + df$maxrange) / 2
fitdf = df %>% group_by(celltype) %>% do(decayfit = nls(mean_dropout_rate ~ SSasymp(mean, yf, y0, log_alpha), data=.))
decayfit_tidy = broom::tidy(fitdf, decayfit)
fitted = list()
newdf = data.frame(range = NA, celltype = NA, var = NA, mean = NA, minrange = NA, maxrange = NA, mean_dropout_rate = NA, fitted = NA)
for (t in 1:length(table(label))){
tmp = df %>% filter(celltype==names(table(label))[t])
tmp$fitted = predict(fitdf$decayfit[t][[1]], newdata = tmp)
newdf = rbind(newdf, as.data.frame(tmp))
}
newdf = newdf[-1, ]
ind = sort(newdf$mean_dropout_rate, index.return=TRUE)$ix
newdf = newdf[ind, ]
newdf = newdf %>% filter(mean < 7)
g2 = ggplot(newdf%>% filter(mean<7), aes(x=mean, y=mean_dropout_rate, col=celltype)) +
geom_point(alpha = 0.5) + geom_line(aes(x=mean, y=fitted, col=celltype), lwd=2, alpha = 0.3) +
ylab("dropout rate") +
xlab("gene mean") +
ggtitle("normalized") +
ylim(c(0,1))
ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

Data set 2 : 10X 68K cells
X = readMM("../data/10X_68K/filtered_matrix/hg19/matrix.mtx")
barcodes = read.table("../data/10X_68K/filtered_matrix/hg19/barcodes.tsv", sep = "\t")
genes = read.table("../data/10X_68K/filtered_matrix/hg19/genes.tsv", sep = "\t")
annotation = read.table("../data/10X_68K/filtered_matrix/hg19/barocdes_annotation.tsv", sep = "\t", header = TRUE)
label = annotation$celltype
X = t(as.matrix(X))
df = preprocess(X, label=label, normalize = FALSE)
df2 = preprocess(X, label = label, normalize = TRUE)
rm(X); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2064127 110.3 3683070 196.7 3683070 196.7
## Vcells 6484855 49.5 5825166023 44442.5 7584762096 57867.2
table(label)
## label
## CD14+ Monocyte CD19+ B
## 2862 5908
## CD34+ CD4+ T Helper2
## 277 97
## CD4+/CD25 T Reg CD4+/CD45RA+/CD25- Naive T
## 6187 1873
## CD4+/CD45RO+ Memory CD56+ NK
## 3061 8776
## CD8+ Cytotoxic T CD8+/CD45RA+ Naive Cytotoxic
## 20773 16666
## Dendritic
## 2099
tmpdf = df %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(gene_mean<10)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g1 = ggplot(tmpdf,
aes(x=gene_mean, y=dropout_rate, col = celltype)) +
geom_point(alpha = 0.4) +
ylab("drop out rate") +
xlab("mean(X) for X>0") +
ggtitle("dropout rates vs gene mean")
tmpdf = df2 %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(gene_mean<10)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g2 = ggplot(tmpdf,
aes(x=gene_mean, y=dropout_rate, col = celltype)) +
geom_point(alpha = 0.4) +
ylab("drop out rate") +
xlab("mean(X) for X>0")+
ggtitle("normalized")
ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

tmpdf = df %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(celltype %in% c("CD19+ B", "CD4+/CD45RO+ Memory")) %>% filter(gene_mean<4)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g1 = ggplot(tmpdf,
aes(x=gene_mean, y=dropout_rate, col = celltype)) +
geom_point(alpha = 0.4) +
ylab("drop out rate") +
xlab("mean(X) for X>0")+
ggtitle("two celltypes selected")
tmpdf = df2 %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(celltype %in% c("CD19+ B", "CD4+/CD45RO+ Memory")) %>% filter(gene_mean<4)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g2 = ggplot(tmpdf,
aes(x=gene_mean, y=dropout_rate, col = celltype)) +
geom_point(alpha = 0.4) +
ylab("drop out rate") +
xlab("mean(X) for X>0")+
ggtitle("normalized")
ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

df$range = cut(df$dropout_rate, seq(0.01,0.99,length=99), include.lowest=TRUE)
df = df%>% group_by(range, celltype) %>% summarize(var = var(gene_mean), mean = mean(gene_mean))
df$range = as.character(df$range)
df$minrange = substring(sapply(strsplit(df$range, ','), function(x) x[1]), 2)
df$maxrange = gsub('.{1}$', '', sapply(strsplit(df$range, ','), function(x) x[2]) )
df$minrange = as.numeric(df$minrange)
df$maxrange = as.numeric(df$maxrange)
df$mean_dropout_rate = (df$minrange + df$maxrange) / 2
fitdf = df %>% group_by(celltype) %>% do(decayfit = nls(mean_dropout_rate ~ SSasymp(mean, yf, y0, log_alpha), data=.))
decayfit_tidy = broom::tidy(fitdf, decayfit)
fitted = list()
newdf = data.frame(range = NA, celltype = NA, var = NA, mean = NA, minrange = NA, maxrange = NA, mean_dropout_rate = NA, fitted = NA)
for (t in 1:length(table(label))){
tmp = df %>% filter(celltype==names(table(label))[t])
tmp$fitted = predict(fitdf$decayfit[t][[1]], newdata = tmp)
newdf = rbind(newdf, as.data.frame(tmp))
}
newdf = newdf[-1, ]
ind = sort(newdf$mean_dropout_rate, index.return=TRUE)$ix
newdf = newdf[ind, ]
newdf = newdf %>% filter(mean < 7)
g1 = ggplot(newdf%>% filter(mean<7), aes(x=mean, y=mean_dropout_rate, col=celltype)) +
geom_point(alpha = 0.5) + geom_line(aes(x=mean, y=fitted, col=celltype), lwd=2, alpha = 0.3) +
ylab("dropout rate") +
xlab("gene mean") +
ylim(c(0,1)) +
ggtitle("fitted exponential decay")
df = df2
df$range = cut(df$dropout_rate, seq(0.01,0.99,length=99), include.lowest=TRUE)
df = df%>% group_by(range, celltype) %>% summarize(var = var(gene_mean), mean = mean(gene_mean))
df$range = as.character(df$range)
df$minrange = substring(sapply(strsplit(df$range, ','), function(x) x[1]), 2)
df$maxrange = gsub('.{1}$', '', sapply(strsplit(df$range, ','), function(x) x[2]) )
df$minrange = as.numeric(df$minrange)
df$maxrange = as.numeric(df$maxrange)
df$mean_dropout_rate = (df$minrange + df$maxrange) / 2
fitdf = df %>% group_by(celltype) %>% do(decayfit = nls(mean_dropout_rate ~ SSasymp(mean, yf, y0, log_alpha), data=.))
decayfit_tidy = broom::tidy(fitdf, decayfit)
fitted = list()
newdf = data.frame(range = NA, celltype = NA, var = NA, mean = NA, minrange = NA, maxrange = NA, mean_dropout_rate = NA, fitted = NA)
for (t in 1:length(table(label))){
tmp = df %>% filter(celltype==names(table(label))[t])
tmp$fitted = predict(fitdf$decayfit[t][[1]], newdata = tmp)
newdf = rbind(newdf, as.data.frame(tmp))
}
newdf = newdf[-1, ]
ind = sort(newdf$mean_dropout_rate, index.return=TRUE)$ix
newdf = newdf[ind, ]
newdf = newdf %>% filter(mean < 7)
g2 = ggplot(newdf%>% filter(mean<7), aes(x=mean, y=mean_dropout_rate, col=celltype)) +
geom_point(alpha = 0.5) + geom_line(aes(x=mean, y=fitted, col=celltype), lwd=2, alpha = 0.3) +
ylab("dropout rate") +
xlab("gene mean") +
ggtitle("normalized") +
ylim(c(0,1))
ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

Data Set 3 : GSE76381 Developmental Midbrain
x = fread("../data/GSE76381/GSE76381_ESMoleculeCounts.cef.txt.gz")
X = x[-(1:4), -2]
X = matrix(NA, nrow(X)-1, ncol(X) - 1)
label = as.character(x[3, -(1:2)])
X = (as.matrix(x[6:nrow(x), 3:ncol(x)]))
X = apply(X, 2, as.numeric)
rownames(X) = x$V1[-(1:5)]
colnames(X) = x[2, -(1:2)]
rm(x); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2144155 114.6 3683070 196.7 3683070 196.7
## Vcells 65613692 500.6 4660132818 35554.0 7584762096 57867.2
X = t(X)
df = preprocess(X, label=label, normalize = FALSE)
df2 = preprocess(X, label = label, normalize = TRUE)
rm(X); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2138958 114.3 3683070 196.7 3683070 196.7
## Vcells 35430539 270.4 977300684 7456.3 7584762096 57867.2
table(label)
## label
## eNb1 eNb2 eNb3 eNb4 eProg1a eProg1b eProg2a eProg2b eRgla
## 121 84 42 45 244 263 126 118 67
## eRglb eRglc eRgld eRgle eRglf eSCa eSCb eSCc
## 68 55 47 45 69 126 117 78
tmpdf = df %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(gene_mean<10)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g1 = ggplot(tmpdf,
aes(x=gene_mean, y=dropout_rate, col = celltype)) +
geom_point(alpha = 0.4) +
ylab("drop out rate") +
xlab("mean(X) for X>0") +
ggtitle("dropout rates vs gene mean")
tmpdf = df2 %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(gene_mean<10)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g2 = ggplot(tmpdf,
aes(x=gene_mean, y=dropout_rate, col = celltype)) +
geom_point(alpha = 0.4) +
ylab("drop out rate") +
xlab("mean(X) for X>0")+
ggtitle("normalized")
ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

tmpdf = df %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(celltype %in% c("eNb1", "eSCa")) %>% filter(gene_mean<4)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g1 = ggplot(tmpdf,
aes(x=gene_mean, y=dropout_rate, col = celltype)) +
geom_point(alpha = 0.4) +
ylab("drop out rate") +
xlab("mean(X) for X>0")+
ggtitle("two celltypes selected")
tmpdf = df2 %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(celltype %in% c("eNb1", "eSCa")) %>% filter(gene_mean<4)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g2 = ggplot(tmpdf,
aes(x=gene_mean, y=dropout_rate, col = celltype)) +
geom_point(alpha = 0.4) +
ylab("drop out rate") +
xlab("mean(X) for X>0")+
ggtitle("normalized")
ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

df$range = cut(df$dropout_rate, seq(0.01,0.99,length=99), include.lowest=TRUE)
df = df%>% group_by(range, celltype) %>% summarize(var = var(gene_mean), mean = mean(gene_mean))
df$range = as.character(df$range)
df$minrange = substring(sapply(strsplit(df$range, ','), function(x) x[1]), 2)
df$maxrange = gsub('.{1}$', '', sapply(strsplit(df$range, ','), function(x) x[2]) )
df$minrange = as.numeric(df$minrange)
df$maxrange = as.numeric(df$maxrange)
df$mean_dropout_rate = (df$minrange + df$maxrange) / 2
fitdf = df %>% group_by(celltype) %>% do(decayfit = nls(mean_dropout_rate ~ SSasymp(mean, yf, y0, log_alpha), data=.))
decayfit_tidy = broom::tidy(fitdf, decayfit)
fitted = list()
newdf = data.frame(range = NA, celltype = NA, var = NA, mean = NA, minrange = NA, maxrange = NA, mean_dropout_rate = NA, fitted = NA)
for (t in 1:length(table(label))){
tmp = df %>% filter(celltype==names(table(label))[t])
tmp$fitted = predict(fitdf$decayfit[t][[1]], newdata = tmp)
newdf = rbind(newdf, as.data.frame(tmp))
}
newdf = newdf[-1, ]
ind = sort(newdf$mean_dropout_rate, index.return=TRUE)$ix
newdf = newdf[ind, ]
newdf = newdf %>% filter(mean < 7)
g1 = ggplot(newdf%>% filter(mean<7), aes(x=mean, y=mean_dropout_rate, col=celltype)) +
geom_point(alpha = 0.5) + geom_line(aes(x=mean, y=fitted, col=celltype), lwd=2, alpha = 0.3) +
ylab("dropout rate") +
xlab("gene mean") +
ylim(c(0,1)) +
ggtitle("fitted exponential decay")
df = df2
df$range = cut(df$dropout_rate, seq(0.01,0.99,length=99), include.lowest=TRUE)
df = df%>% group_by(range, celltype) %>% summarize(var = var(gene_mean), mean = mean(gene_mean))
df$range = as.character(df$range)
df$minrange = substring(sapply(strsplit(df$range, ','), function(x) x[1]), 2)
df$maxrange = gsub('.{1}$', '', sapply(strsplit(df$range, ','), function(x) x[2]) )
df$minrange = as.numeric(df$minrange)
df$maxrange = as.numeric(df$maxrange)
df$mean_dropout_rate = (df$minrange + df$maxrange) / 2
fitdf = df %>% group_by(celltype) %>% do(decayfit = nls(mean_dropout_rate ~ SSasymp(mean, yf, y0, log_alpha), data=.))
decayfit_tidy = broom::tidy(fitdf, decayfit)
fitted = list()
newdf = data.frame(range = NA, celltype = NA, var = NA, mean = NA, minrange = NA, maxrange = NA, mean_dropout_rate = NA, fitted = NA)
for (t in 1:length(table(label))){
tmp = df %>% filter(celltype==names(table(label))[t])
tmp$fitted = predict(fitdf$decayfit[t][[1]], newdata = tmp)
newdf = rbind(newdf, as.data.frame(tmp))
}
newdf = newdf[-1, ]
ind = sort(newdf$mean_dropout_rate, index.return=TRUE)$ix
newdf = newdf[ind, ]
newdf = newdf %>% filter(mean < 7)
g2 = ggplot(newdf%>% filter(mean<7), aes(x=mean, y=mean_dropout_rate, col=celltype)) +
geom_point(alpha = 0.5) + geom_line(aes(x=mean, y=fitted, col=celltype), lwd=2, alpha = 0.3) +
ylab("dropout rate") +
xlab("gene mean") +
ggtitle("normalized") +
ylim(c(0,1))
ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

Data 4 : GSE114724 Diverse Immune Phenotypes in the Breast Tumor Microenvironment
subpart 1 : Patient BC09 Tumor 1
X = readRDS("../data/GSE114724/GSE114724_CLEANED/bc09_tumor1.rds")
label = read.table("../data/GSE114724/GSE114724_CLEANED/bc09_tumor1_label.txt", stringsAsFactors = FALSE)$V1
X = t(as.matrix(X))
df = preprocess(X, label=label, normalize = FALSE)
df2 = preprocess(X, label = label, normalize = TRUE)
rm(X); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2140890 114.4 3683070 196.7 3683070 196.7
## Vcells 34927001 266.5 500377949 3817.6 7584762096 57867.2
table(label)
## label
## B: MACROPHAGE: MAST:
## 664 310 10
## mDC: MONOCYTE:precursor NEUTROPHIL:
## 97 355 52
## NK:CD56+16+3- NK:CD56+16+3+NKT NKT
## 31 953 180
## T:CD4+CM T:CD4+NAIVE T:CD8+EM
## 249 1290 331
## T:CD8+NAIVE
## 765
tmpdf = df %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(gene_mean<10)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g1 = ggplot(tmpdf,
aes(x=gene_mean, y=dropout_rate, col = celltype)) +
geom_point(alpha = 0.4) +
ylab("drop out rate") +
xlab("mean(X) for X>0") +
ggtitle("dropout vs gene mean")
tmpdf = df2 %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(gene_mean<10)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g2 = ggplot(tmpdf,
aes(x=gene_mean, y=dropout_rate, col = celltype)) +
geom_point(alpha = 0.4) +
ylab("drop out rate") +
xlab("mean(X) for X>0")+
ggtitle("normalized")
ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

tmpdf = df %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(celltype %in% c("B:", "NEUTROPHIL:")) %>% filter(gene_mean<4)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g1 = ggplot(tmpdf,
aes(x=gene_mean, y=dropout_rate, col = celltype)) +
geom_point(alpha = 0.4) +
ylab("drop out rate") +
xlab("mean(X) for X>0")+
ggtitle("not normalized")
tmpdf = df2 %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(celltype %in% c("B:", "NEUTROPHIL:")) %>% filter(gene_mean<4)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g2 = ggplot(tmpdf,
aes(x=gene_mean, y=dropout_rate, col = celltype)) +
geom_point(alpha = 0.4) +
ylab("drop out rate") +
xlab("mean(X) for X>0")+
ggtitle("normalized")
ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

df$range = cut(df$dropout_rate, seq(0.01,0.99,length=99), include.lowest=TRUE)
df = df%>% group_by(range, celltype) %>% summarize(var = var(gene_mean), mean = mean(gene_mean))
df$range = as.character(df$range)
df$minrange = substring(sapply(strsplit(df$range, ','), function(x) x[1]), 2)
df$maxrange = gsub('.{1}$', '', sapply(strsplit(df$range, ','), function(x) x[2]) )
df$minrange = as.numeric(df$minrange)
df$maxrange = as.numeric(df$maxrange)
df$mean_dropout_rate = (df$minrange + df$maxrange) / 2
fitdf = df %>% group_by(celltype) %>% do(decayfit = nls(mean_dropout_rate ~ SSasymp(mean, yf, y0, log_alpha), data=.))
decayfit_tidy = broom::tidy(fitdf, decayfit)
fitted = list()
newdf = data.frame(range = NA, celltype = NA, var = NA, mean = NA, minrange = NA, maxrange = NA, mean_dropout_rate = NA, fitted = NA)
for (t in 1:length(table(label))){
tmp = df %>% filter(celltype==names(table(label))[t])
tmp$fitted = predict(fitdf$decayfit[t][[1]], newdata = tmp)
newdf = rbind(newdf, as.data.frame(tmp))
}
newdf = newdf[-1, ]
ind = sort(newdf$mean_dropout_rate, index.return=TRUE)$ix
newdf = newdf[ind, ]
newdf = newdf %>% filter(mean < 7)
g1 = ggplot(newdf%>% filter(mean<7), aes(x=mean, y=mean_dropout_rate, col=celltype)) +
geom_point(alpha = 0.5) + geom_line(aes(x=mean, y=fitted, col=celltype), lwd=2, alpha = 0.3) +
ylab("dropout rate") +
xlab("gene mean") +
ylim(c(0,1)) +
ggtitle("not normalized")
df = df2
df$range = cut(df$dropout_rate, seq(0.01,0.99,length=99), include.lowest=TRUE)
df = df%>% group_by(range, celltype) %>% summarize(var = var(gene_mean), mean = mean(gene_mean))
df$range = as.character(df$range)
df$minrange = substring(sapply(strsplit(df$range, ','), function(x) x[1]), 2)
df$maxrange = gsub('.{1}$', '', sapply(strsplit(df$range, ','), function(x) x[2]) )
df$minrange = as.numeric(df$minrange)
df$maxrange = as.numeric(df$maxrange)
df$mean_dropout_rate = (df$minrange + df$maxrange) / 2
fitdf = df %>% group_by(celltype) %>% do(decayfit = nls(mean_dropout_rate ~ SSasymp(mean, yf, y0, log_alpha), data=.))
decayfit_tidy = broom::tidy(fitdf, decayfit)
fitted = list()
newdf = data.frame(range = NA, celltype = NA, var = NA, mean = NA, minrange = NA, maxrange = NA, mean_dropout_rate = NA, fitted = NA)
for (t in 1:length(table(label))){
tmp = df %>% filter(celltype==names(table(label))[t])
tmp$fitted = predict(fitdf$decayfit[t][[1]], newdata = tmp)
newdf = rbind(newdf, as.data.frame(tmp))
}
newdf = newdf[-1, ]
ind = sort(newdf$mean_dropout_rate, index.return=TRUE)$ix
newdf = newdf[ind, ]
newdf = newdf %>% filter(mean < 7)
g2 = ggplot(newdf%>% filter(mean<7), aes(x=mean, y=mean_dropout_rate, col=celltype)) +
geom_point(alpha = 0.5) + geom_line(aes(x=mean, y=fitted, col=celltype), lwd=2, alpha = 0.3) +
ylab("dropout rate") +
xlab("gene mean") +
ggtitle("normalized") +
ylim(c(0,1))
ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)
